from collections import Counter

import numpy as np
import pandas as pd

# 读取数据
data = pd.read_excel("../doc/datasource/C8-8.5-数据采集-clean.xlsx")

# 提取每一行的genre元素  -> 新的列表

# 删掉左中括号
data['type_list'] = data['type_list'].str.strip('[')
# 删掉右中括号
data['type_list'] = data['type_list'].str.strip(']')
# 处理NaN
data['type_list'] = data['type_list'].fillna(value='')

genre_list = []

for g in data['type_list']:
    g_list = g.split(', ')
    for l in g_list:
        genre_list.append(l)

# 给集合去重
g_list = list(set(genre_list))
# 去除空值

# g_list.remove('')

# 统计每个类型标签对应的电影数量、条数、频数
data_genre_tj = pd.DataFrame(np.zeros([len(g_list), 1]), index=g_list, columns=['统计'])

for i in data['type_list']:
    for label in g_list:
        if str(i).__contains__(label):
            data_genre_tj.loc[label, '统计'] += 1
print(data_genre_tj)
